#!/usr/bin/env python

import argparse
import os
import time
import datetime
import json
import pandas as pd
import collections
import gzip
import cPickle as pickle
import itertools
import re


# Command-line interface.  The path defaults point at the original author's
# working directories; override them with the flags below.
parser = argparse.ArgumentParser(
    description='Select variables from text file, line-by-line')
parser.add_argument(
    '--info-file',
    metavar='INFO',
    default='/home/whobbs/workspace/cavr_wkdir/CalVoter_scripts/'
            'california_files_json.txt',
    help='information about variables, file location, and input separator'
    )
parser.add_argument(
    '--out-directory',
    metavar='DIR',
    default='~/workspace/cavr_wkdir/CalVoter_files/retrynames/',
    help='local directory to write outfile'
    )
parser.add_argument(
    '--chunksize',
    metavar='CHUNK',
    type=int,
    default=100000,
    help='rows per iteration'
    )
# NOTE(review): no `type=` is given, so any value supplied on the command
# line arrives as a non-empty (truthy) string -- even "False".  The option is
# not read anywhere else in the visible script.
parser.add_argument(
    '--first-run',
    metavar='FIRST',
    default=True,
    help='first-run flag (default: True); currently unused by this script'
    )
args = parser.parse_args()

# Load the per-file-year configuration.  The loops below read, for each key,
# the first element of its list and look up 'filename', 'sep', 'pid', 'hid',
# etc. on it.  The context manager closes the handle once parsing is done.
with open(os.path.expanduser(args.info_file)) as json_data:
    file_info = json.load(json_data)

# Number of rows pandas reads per chunk.
the_chunksize = args.chunksize

# from stackoverflow
def format_text(text):
    """Normalise one cell value for comparison.

    The value is upper-cased and then lower-cased, after which surrounding
    commas, then surrounding colons, then surrounding spaces are removed --
    in that order, one pass each.  Anything that lacks the string methods
    used here (e.g. a number or None) is handed back untouched.
    """
    try:
        cleaned = text.upper().lower()
        # Order matters: each pass only sees what the previous one exposed.
        for edge_char in (",", ":", " "):
            cleaned = cleaned.strip(edge_char)
    except AttributeError:
        return text
    return cleaned

def merge_dols(dol1, dol2):
    """Merge two dicts of lists into a new dict.

    Keys present in only one input keep that input's value.  For keys present
    in both, the result holds ``dol1[k] + dol2[k]`` (a new concatenated
    value).  Neither input dict is modified.

    Unlike ``dict(dol1, **dol2)``, this works for keys of any hashable type,
    not only strings.
    """
    result = dict(dol1)
    result.update(dol2)
    for k in set(dol1).intersection(dol2):
        result[k] = dol1[k] + dol2[k]
    return result

ineligible_pids = []

#### summarize PIDs and HIDs for each file-year
print "Finding PID duplicates and high HID occupancies:"
for the_year in sorted(file_info.keys(), reverse=True):

    infile = file_info[the_year][0]['filename']
    pids = file_info[the_year][0]['pid']
    pids2 = file_info[the_year][0]['pid2']
    pids3 = file_info[the_year][0]['pid3']
    dobs = file_info[the_year][0]['dob']
    unique_pid = file_info[the_year][0]['unique_pid'] == 'True'
    # currlast = file_info[the_year][0]['currlast']
    prevlast = file_info[the_year][0]['prevlast']
    hids = file_info[the_year][0]['hid']    
    keep = file_info[the_year][0]['keep']
    variables = list(set(pids + pids2 + pids3 + [prevlast] + hids + [dobs] + keep))
    the_sep = file_info[the_year][0]['sep']

    DATAReader = pd.read_csv(
        infile,
        quoting=0,
        sep=the_sep,
        # nrows=100000,
        # iterator=True,
        chunksize = the_chunksize,
        usecols=variables,
        # na_values="ZZNAZZ",
        # keep_default_na=False,
        dtype=object,
        na_filter=False,
        error_bad_lines=False   # SKIPS bad lines
        )
    
    the_chunk = 0
    
    pid_counts = collections.Counter()
    pid_counts2 = collections.Counter()
    hid_counts = collections.Counter()
    hid_to_pid = collections.defaultdict()
    # pid2_to_pid = collections.defaultdict()

    start = time.time()
    # 
    print "  " + the_year + ".. " + "(current time: " + \
	datetime.datetime.fromtimestamp(
        time.time()
        ).strftime('%Y-%m-%d %H:%M:%S') + ")"

   
    for DATA in DATAReader:     # chunking not necessary (?)
        if the_chunk % the_chunksize == 0:
            print "    " + str(the_chunk) + ' (' + str(len(DATA.index)) + ')'

        DATA_columns_orig = DATA.columns
        DATA = DATA.applymap(format_text)

        DATA["pid"] = DATA.apply(
            lambda row: '\t'.join(map(str, row[pids])), axis=1
            )
        DATA["pid2"] = DATA.apply(
            lambda row: '\t'.join(map(str, row[pids2])), axis=1
            )
        DATA["pid3"] = DATA.apply(
            lambda row: '\t'.join(map(str, row[pids3])), axis=1
            )        

        hid_counts = hid_counts + collections.Counter(
            DATA.apply(lambda row: '\t'.join(map(str, row[hids])), axis=1)
            )
        if not unique_pid:
            pid_counts = pid_counts + collections.Counter(
                DATA["pid"]
            )
        elif file_info[the_year][0]['pidtype'] == 'bad':
            pid_counts2 = pid_counts2 + collections.Counter(
            DATA.apply(lambda row: '\t'.join(map(str, row[pids2])), axis=1)
            )

        for i, row in DATA.iterrows():
            the_hid = "\t".join(map(str, row[hids]))
            the_pid = "\t".join(map(str, row[pids]))
            # the_pid2 = "\t".join(map(str, row[pids2]))
            if the_hid in hid_to_pid:
                hid_to_pid[the_hid].update({the_pid:row[dobs]})
            else:
                hid_to_pid[the_hid] = {the_pid:row[dobs]}
            # if the_pid2 in pid2_to_pid:
            #     pid2_to_pid[the_pid2].extend(the_pid)
            # else:
            #     pid2_to_pid[the_pid2] = [the_pid]

        
        DATA_columns_orig = [
            (x.lower()+'_'+the_year.lower()).replace('.','').replace(' ', '') for x in DATA_columns_orig
            ]
        DATA.columns = DATA_columns_orig + ['pid'] + ['pid2'] + ['pid3']
        # if len(pids) > 1:
        DATA_columns_orig.remove('previouslastname_'+the_year.lower())
        the_order = ["pid"]+["pid2"]+['pid3']+['previouslastname_'+the_year.lower()]+DATA_columns_orig
        # else:
        #     the_order = DATA_columns_orig[:(len(DATA.columns)-1)]

        if the_chunk==0:
            DATA[the_order].to_csv(
                os.path.expanduser(
                    args.out_directory +\
                        "formattedByIndividual_" + \
                        the_year + ".csv"), 
                header=True, index=False
                )
            DATA.loc[(DATA['previouslastname'+'_'+the_year.lower()] != "")][['pid','pid2']].to_csv(
                os.path.expanduser(
                    args.out_directory +\
                        "formattedByIndividual_altIDs_" + \
                        the_year + ".csv"), 
                header=True, index=False
                )
        else:
            DATA[the_order].to_csv(
                os.path.expanduser(
                    args.out_directory +\
                        "formattedByIndividual_" + \
                        the_year + ".csv"), 
                header=False, index=False, mode='a'
                )
            DATA.loc[(DATA['previouslastname'+'_'+the_year.lower()] != "")][['pid','pid2']].to_csv(
                os.path.expanduser(
                    args.out_directory +\
                        "formattedByIndividual_altIDs_" + \
                        the_year + ".csv"), 
                header=False, index=False, mode='a'
                )
        
        the_chunk += the_chunksize
                    
    print "    collecting file-year duplicated PIDs.."
    the_dup_pids = list(
        key
        for key, value 
        in pid_counts.iteritems() if value > 1
        )
    if file_info[the_year][0]['pidtype'] == 'backup':
        the_dup_pids2 = list(
            key
            for key, value 
            in pid_counts2.iteritems() if value > 1
            )
    
    print "    collecting file-year high occupancy HIDs.."
    high_occ_hids = set(list(
        key
        for key, value in hid_counts.iteritems() 
        if value > 6
        ))
    
    print "    extending cross-year ineligible PIDs.."

    ineligible_pids.extend(the_dup_pids)
    ineligible_pids.extend([
            hid_to_pid[key].keys() for key in high_occ_hids
        ])
    # if file_info[the_year][0]['pidtype'] == 'backup':
    #     ineligible_pids.extend([pid2_to_pid[k] for k in the_dup_pids2])

    print "    pickling.."
    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "dup_pids_" + \
                the_year + \
                ".pgz"
            ), 'wb') as f:
        pickle.dump(the_dup_pids, f)
        
    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "high_occ_hids_" + \
                the_year + \
                ".pgz"
            ), 'wb') as f:
        pickle.dump(high_occ_hids, f)

    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "hid_to_pid_" + \
                the_year + \
                ".pgz"
            ), 'wb') as f:
        pickle.dump(hid_to_pid, f)

    
    print "  Done." + " (time spent on file-year: " + str(datetime.timedelta(seconds=time.time() - start)) + ".)\n"


for the_year in sorted(file_info.keys(), reverse=True):
    print "  " + the_year + ".. " + "(current time: " + \
	datetime.datetime.fromtimestamp(
        time.time()
        ).strftime('%Y-%m-%d %H:%M:%S') + ")"

    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "dup_pids_" + \
                the_year + \
                ".pgz"
            ), 'rb'
        ) as f:
        the_dup_pids = pickle.load(f)

    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "high_occ_hids_" + \
                the_year + \
                ".pgz"
            ), 'rb'
        ) as f:
        high_occ_hids = pickle.load(f)

    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "hid_to_pid_" + \
                the_year + \
                ".pgz"
            ), 'rb'
        ) as f:
        hid_to_pid = pickle.load(f)

    ineligible_pids.extend(the_dup_pids)
    ineligible_pids.extend([
        value.keys()
        for key, value in hid_to_pid.iteritems() if key in high_occ_hids
        ][0])                   # [0] because value.keys() is a list


print "Pickling ineligible PIDs..\n"
ineligible_pids = [item for sublist in ineligible_pids for item in sublist]
ineligible_pids = set(ineligible_pids)
with gzip.open(
    os.path.expanduser(
        args.out_directory +\
            "ineligible_pids_" + \
            file_info.keys()[0][0:4] + \
            ".pgz"
        ), 'wb') as f:
    pickle.dump(ineligible_pids, f)
    

# with gzip.open(
#     os.path.expanduser(
#         args.out_directory +\
#             "ineligible_pids_" + \
#             file_info.keys()[0][0:4] + \
#             ".pgz"
#         ), 'rb'
#     ) as f:
#     ineligible_pids = pickle.load(f)

# ineligible_pids = [item for sublist in ineligible_pids for item in sublist]
# ineligible_pids = set(ineligible_pids)

#### collect all ineligible voters,
#### along with and all voters living with ineligible voter in any file-year
all_ineligible_pids = []
print "Collecting all ineligible PIDs:"
for the_year in sorted(file_info.keys(), reverse=True):
    start = time.time()
    print "  " + the_year + ".. " + " (current time: " + \
	datetime.datetime.fromtimestamp(
        time.time()
        ).strftime('%Y-%m-%d %H:%M:%S') + ")"
    with gzip.open(
        os.path.expanduser(
            args.out_directory +\
                "hid_to_pid_" + \
                the_year + \
                ".pgz"
            ), 'rb'
        ) as f:
        hid_to_pid = pickle.load(f)
    print "    collecting all, file-year ineligible HIDs.."
    pid_to_hid = collections.defaultdict()
    # flip hid to pid (to PID:HID)
    for k, v in hid_to_pid.items():
        for subk, subv in v.items():
            pid_to_hid[subk] = k
    # get ineligible hids (and flip again)
    ineligible_pids_year =  ineligible_pids.intersection(set(pid_to_hid.keys()))
    ineligible_hids = set([pid_to_hid[k] for k in ineligible_pids_year])
    # take out the PID lists for each ineligible household
    ineligible_pids_by_hids = list(hid_to_pid[k] for k in ineligible_hids)
    # 
    print "    extending cross-year ineligible PIDs (with households).."
    all_ineligible_pids.extend(
        list(set( 
                list(itertools.chain(*ineligible_pids_by_hids))
                ))
        )
    print "  Done." + " (time spent on file-year: " + str(datetime.timedelta(seconds=time.time() - start)) + ".)\n"


print "Pickling all ineligible PIDs..\n"
all_ineligible_pids = set(all_ineligible_pids)
with gzip.open(
    os.path.expanduser(
        args.out_directory +\
            "all_ineligible_pids_" + \
            file_info.keys()[0][0:4] + \
            ".pgz"
        ), 'wb') as f:
    pickle.dump(all_ineligible_pids, f)
